#!/usr/bin/env python3
"""
Test suite for StringZilla package.
For full coverage, preinstall NumPy and PyArrow.
To run locally:

    uv pip install numpy pyarrow pytest pytest-repeat
    uv pip install -e . --force-reinstall --no-build-isolation
    uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x

Recommended flags for better diagnostics:

    -s                  show test output (no capture)
    -vv                 verbose output
    --maxfail=1         stop at first failure
    --full-trace        full Python tracebacks
    -k <pattern>        filter tests by substring
    -X faulthandler     to dump on fatal signals
    --verbose           enable verbose output

Example:

    uv pip install -e . --force-reinstall --no-build-isolation --verbose
    uv run --no-project python -X faulthandler -m pytest scripts/test_stringzilla.py -s -vv --maxfail=1 --full-trace
"""

import os
import sys
import math
import tempfile
import platform
import hashlib
import hmac
from random import choice, randint, seed
from string import ascii_lowercase
from typing import Optional, Sequence, Dict

import pytest

# Import shared Unicode data loading functions
from test_helpers import (
    UNICODE_VERSION,
    get_unicode_xml_data,
    parse_case_folding_file,
    get_case_folding_rules,
    get_case_folding_rules_as_codepoints,
    get_normalization_props,
)

import stringzilla as sz
from stringzilla import Str, Strs

# NumPy is available on most platforms and is required for most tests.
# ! When using PyPy on some platforms NumPy has internal issues, that will
# ! raise a weird error, not an `ImportError`. That's why we intentionally
# ! use a naked `except:`. Necessary evil!
try:
    import numpy as np

    numpy_available = True
except:  # noqa: E722
    # NumPy is not installed, most tests will be skipped
    numpy_available = False


# PyArrow is not available on most platforms.
# ! When using PyPy on some platforms PyArrow has internal issues, that will
# ! raise a weird error, not an `ImportError`. That's why we intentionally
# ! use a naked `except:`. Necessary evil!
try:
    import pyarrow as pa

    pyarrow_available = True
except:  # noqa: E722
    # PyArrow is not installed, most tests will be skipped
    pyarrow_available = False

# Reproducible test seeds for consistent CI runs (keep in sync with test_stringzillas.py)
SEED_VALUES = [
    42,  # Classic test seed
    0,  # Edge case: zero seed
    1,  # Minimal positive seed
    314159,  # Pi digits
]


@pytest.fixture(scope="session", autouse=True)
def log_test_environment():
    """Automatically log environment info before running any tests."""

    print()  # New line for better readability
    print("=== StringZilla Test Environment ===")
    print(f"Platform: {platform.platform()}")
    print(f"Architecture: {platform.machine()}")
    print(f"Processor: {platform.processor()}")
    print(f"Python: {platform.python_version()}")
    print(f"StringZilla version: {sz.__version__}")
    print(f"StringZilla capabilities: {sorted(sz.__capabilities__)}")
    print(f"NumPy available: {numpy_available}")
    if numpy_available:
        print(f"NumPy version: {np.__version__}")
    print(f"PyArrow available: {pyarrow_available}")
    if pyarrow_available:
        print(f"PyArrow version: {pa.__version__}")

    # If QEMU is indicated via env (e.g., set by pyproject), mask out SVE/SVE2 to avoid emulation flakiness.
    is_qemu = os.environ.get("SZ_IS_QEMU_", "").lower() in ("1", "true", "yes", "on")
    if is_qemu:
        sve_like = {"sve", "sve2", "sve2+aes"}
        current = list(getattr(sz, "__capabilities__", ()))
        desired = tuple(c for c in current if c.lower() not in sve_like)
        if len(desired) != len(current):
            print(f"QEMU env detected; disabling {sve_like} for stability")
            sz.reset_capabilities(desired)

    print("=" * 40)
    print()  # New line for better readability


def seed_random_generators(seed_value: Optional[int] = None):
    """Seed Python and NumPy RNGs for reproducibility."""
    if seed_value is None:
        return
    seed(seed_value)
    # Try to seed NumPy's random number generator
    # This handles both NumPy 1.x and 2.x, and any import issues
    if numpy_available:
        try:
            np.random.seed(seed_value)
        except (ImportError, AttributeError, Exception):
            pass


def test_library_properties():
    assert len(sz.__version__.split(".")) == 3, "Semantic versioning must be preserved"
    assert "serial" in sz.__capabilities__, "Serial backend must be present"
    assert isinstance(sz.__capabilities_str__, str) and len(sz.__capabilities_str__) > 0
    sz.reset_capabilities(sz.__capabilities__)  # Should not raise


@pytest.mark.parametrize("native_type", [str, bytes, bytearray])
def test_unit_construct(native_type):
    native = "aaaaa"
    if native_type is bytes or native_type is bytearray:
        native = native_type(native, "utf-8")
    big = Str(native)
    assert len(big) == len(native)


def test_str_repr():
    native = "abcdef"
    big = Str(native)
    assert repr(big) == f"sz.Str({repr(native)})"
    assert str(big) == native


def test_unit_indexing():
    native = "abcdef"
    big = Str(native)
    for i in range(len(native)):
        assert big[i] == native[i]


def test_unit_count():
    native = "aaaaa"
    big = Str(native)
    assert big.count("a") == 5
    assert big.count("aa") == 2
    assert big.count("aa", allowoverlap=True) == 4


def test_unit_count_byteset():
    native = "abcdef"
    big = Str(native)

    assert big.count_byteset("abc") == 3  # a, b, c
    assert big.count_byteset("xyz") == 0  # no matches
    assert big.count_byteset("aeiou") == 2  # a and e

    # Empty inputs
    assert sz.count_byteset("", "abc") == 0
    assert sz.count_byteset("abc", "") == 0
    assert sz.count_byteset("", "") == 0

    # Single character set
    assert sz.count_byteset("hello", "l") == 2
    assert sz.count_byteset("hello", "x") == 0

    # Repeated patterns
    assert sz.count_byteset("mississippi", "si") == 8  # s:4, i:4 total
    assert sz.count_byteset("aaaaaa", "a") == 6

    # Test start/end bounds
    native = "abcdefghij"
    big = Str(native)

    assert big.count_byteset("abc", 0, 3) == 3  # "abc"
    assert big.count_byteset("abc", 1, 3) == 2  # "bc"
    assert big.count_byteset("abc", 3) == 0  # "defghij"
    assert big.count_byteset("hij", 7) == 3  # "hij"
    assert big.count_byteset("hij", -3) == 3  # last 3 chars "hij"
    assert big.count_byteset("abc", 0, -7) == 3  # first 3 chars "abc"

    # Test edge cases
    assert sz.count_byteset("a", "a", 0, 0) == 0  # empty slice
    assert sz.count_byteset("abc", "abc", 10, 20) == 0  # out of bounds
    assert sz.count_byteset("abc", "abc", -10, -5) == 0  # negative out of bounds


def test_unit_contains():
    big = Str("abcdef")
    assert "a" in big
    assert "ab" in big
    assert "xxx" not in big


def test_unit_str_rich_comparisons():
    # Equality
    assert Str("aa") == "aa"
    assert Str("a") != "b"
    assert Str("abc") == Str("abc")
    assert Str("abc") != Str("abd")

    # Less than and less than or equal to
    assert Str("aa") < "b"
    assert Str("ab") <= "ab"
    assert Str("a") < Str("b")
    assert Str("abc") <= Str("abcd")

    # Greater than and greater than or equal to
    assert Str("b") > "aa"
    assert Str("ab") >= "ab"
    assert Str("b") > Str("a")
    assert Str("abcd") >= Str("abc")

    # Slicing and comparisons
    s2 = Str("abb")
    assert s2[1:] == "bb"
    assert s2[:-1] == "ab"
    assert s2[-1:] == "b"
    assert s2[1:] != "abb"
    assert s2[:-2] == "a"
    assert s2[-2:] == "bb"


@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
def test_unit_buffer_protocol():
    """Tests weather conversion to and from the buffer protocol works as expected."""

    # Convert from StringZilla string to NumPy array through buffer protocol
    sz_str = Str("hello")
    np_array = np.array(sz_str)
    assert np_array.dtype == np.dtype("c")
    assert np_array.shape == (len("hello"),)
    assert "".join([c.decode("utf-8") for c in np_array.tolist()]) == "hello"

    # Convert from NumPy array to StringZilla string through buffer protocol
    np_array = np.arange(ord("a"), ord("z"), dtype=np.uint8)
    sz_str = sz.Str(memoryview(np_array))
    assert len(np_array) == len(sz_str)

    # Make sure multi-dimensional contiguous arrays are supported for image processing
    np_array = np.arange(ord("a"), ord("a") + 8, dtype=np.uint8).reshape((2, 2, 2))
    sz_str = sz.Str(memoryview(np_array))
    assert np_array.size == len(sz_str)

    # Expect errors if the data is not contiguous
    np_array = np.arange(ord("a"), ord("z"), dtype=np.uint8)[::2]
    with pytest.raises(ValueError):
        sz.Str(memoryview(np_array))

    # Expect errors if the data is not passed through a `memoryview`
    with pytest.raises(TypeError):
        sz.Str(np.array())
    # Expect errors if the data is not string-like
    with pytest.raises(TypeError):
        sz.Str(dict())


def test_str_write_to():
    native = "line1\nline2\nline3"
    big = Str(native)

    # Create a temporary file
    with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
        temp_filename = tmpfile.name  # Store the name for later use

    try:
        big.write_to(temp_filename)
        with open(temp_filename, "r") as file:
            content = file.read()
            assert content == native, "The content of the file does not match the expected output"
    finally:
        os.remove(temp_filename)


def test_unit_split():
    native = "line1\nline2\nline3"
    big = Str(native)

    # Splitting using a string
    lines = sz.split(big, "\n")
    assert lines == ["line1", "line2", "line3"]

    lines = sz.rsplit(big, "\n")
    assert lines == ["line1", "line2", "line3"]

    lines = sz.split(big, "\n", keepseparator=True)
    assert lines == ["line1\n", "line2\n", "line3"]

    letters = sz.split("a b c d")
    assert letters == ["a", "b", "c", "d"]

    # Splitting using character sets
    letters = sz.split_byteset("a b_c d", " _")
    assert letters == ["a", "b", "c", "d"]

    letters = sz.rsplit_byteset("a b_c d", " _")
    assert letters == ["a", "b", "c", "d"]

    # Check for equivalence with native Python strings for newline separators
    assert native.splitlines() == list(big.splitlines())
    assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))

    # Check for equivalence with native Python strings, including SWAR boundary conditions
    assert native.split("l") == list(big.split("l"))
    assert native.split("li") == list(big.split("li"))
    assert native.split("lin") == list(big.split("lin"))
    assert native.split("line") == list(big.split("line"))
    assert native.split("line1") == list(big.split("line1"))
    assert native.split("line3") == list(big.split("line3"))
    assert native.split("\n", maxsplit=0) == list(big.split("\n", maxsplit=0))
    assert native.split("\n", maxsplit=1) == list(big.split("\n", maxsplit=1))
    assert native.split("\n", maxsplit=2) == list(big.split("\n", maxsplit=2))
    assert native.split("\n", maxsplit=3) == list(big.split("\n", maxsplit=3))
    assert native.split("\n", maxsplit=4) == list(big.split("\n", maxsplit=4))

    # Check for equivalence with native Python strings in reverse order, including boundary conditions
    assert native.rsplit("line1") == list(big.rsplit("line1"))
    assert native.rsplit("line3") == list(big.rsplit("line3"))
    assert native.rsplit("\n", maxsplit=0) == list(big.rsplit("\n", maxsplit=0))
    assert native.rsplit("\n", maxsplit=1) == list(big.rsplit("\n", maxsplit=1))
    assert native.rsplit("\n", maxsplit=2) == list(big.rsplit("\n", maxsplit=2))
    assert native.rsplit("\n", maxsplit=3) == list(big.rsplit("\n", maxsplit=3))
    assert native.rsplit("\n", maxsplit=4) == list(big.rsplit("\n", maxsplit=4))

    # If the passed separator is an empty string, the library must raise a `ValueError`
    with pytest.raises(ValueError):
        sz.split(big, "")
    with pytest.raises(ValueError):
        sz.rsplit(big, "")
    with pytest.raises(ValueError):
        sz.split_byteset(big, "")
    with pytest.raises(ValueError):
        sz.rsplit_byteset(big, "")


def test_unit_split_iterators():
    """
    Test the iterator-based split methods.
    This is slightly different from `split` and `rsplit` in that it returns an iterator instead of a list.
    Moreover, the native `rsplit` and even `rsplit_byteset` report results in the identical order to `split`
    and `split_byteset`. Here `rsplit_iter` reports elements in the reverse order, compared to `split_iter`.
    """
    native = "line1\nline2\nline3"
    big = Str(native)

    # Splitting using a string
    lines = list(sz.split_iter(big, "\n"))
    assert lines == ["line1", "line2", "line3"]

    lines = list(sz.rsplit_iter(big, "\n"))
    assert lines == ["line3", "line2", "line1"]

    lines = list(sz.split_iter(big, "\n", keepseparator=True))
    assert lines == ["line1\n", "line2\n", "line3"]

    lines = list(sz.rsplit_iter(big, "\n", keepseparator=True))
    assert lines == ["\nline3", "\nline2", "line1"]

    letters = list(sz.split_iter("a b c d"))
    assert letters == ["a", "b", "c", "d"]

    # Splitting using character sets
    letters = list(sz.split_byteset_iter("a-b_c-d", "-_"))
    assert letters == ["a", "b", "c", "d"]

    letters = list(sz.rsplit_byteset_iter("a-b_c-d", "-_"))
    assert letters == ["d", "c", "b", "a"]

    # Check for equivalence with native Python strings, including boundary conditions
    assert native.split("line1") == list(big.split_iter("line1"))
    assert native.split("line3") == list(big.split_iter("line3"))
    assert native.split("\n", maxsplit=0) == list(big.split_iter("\n", maxsplit=0))
    assert native.split("\n", maxsplit=1) == list(big.split_iter("\n", maxsplit=1))
    assert native.split("\n", maxsplit=2) == list(big.split_iter("\n", maxsplit=2))
    assert native.split("\n", maxsplit=3) == list(big.split_iter("\n", maxsplit=3))
    assert native.split("\n", maxsplit=4) == list(big.split_iter("\n", maxsplit=4))

    def rlist(seq):
        seq = list(seq)
        seq.reverse()
        return seq

    # Check for equivalence with native Python strings in reverse order, including boundary conditions
    assert native.rsplit("line1") == rlist(big.rsplit_iter("line1"))
    assert native.rsplit("line3") == rlist(big.rsplit_iter("line3"))
    assert native.rsplit("\n", maxsplit=0) == rlist(big.rsplit_iter("\n", maxsplit=0))
    assert native.rsplit("\n", maxsplit=1) == rlist(big.rsplit_iter("\n", maxsplit=1))
    assert native.rsplit("\n", maxsplit=2) == rlist(big.rsplit_iter("\n", maxsplit=2))
    assert native.rsplit("\n", maxsplit=3) == rlist(big.rsplit_iter("\n", maxsplit=3))
    assert native.rsplit("\n", maxsplit=4) == rlist(big.rsplit_iter("\n", maxsplit=4))

    # If the passed separator is an empty string, the library must raise a `ValueError`
    with pytest.raises(ValueError):
        sz.split_iter(big, "")
    with pytest.raises(ValueError):
        sz.rsplit_iter(big, "")
    with pytest.raises(ValueError):
        sz.split_byteset_iter(big, "")
    with pytest.raises(ValueError):
        sz.rsplit_byteset_iter(big, "")


def test_unit_strip():
    # Test with whitespace (default behavior)
    native_whitespace = "  \t\n hello world \r\f\v  "
    big_whitespace = Str(native_whitespace)

    # Test lstrip
    assert native_whitespace.lstrip() == str(sz.lstrip(big_whitespace))
    assert native_whitespace.lstrip() == str(big_whitespace.lstrip())

    # Test rstrip
    assert native_whitespace.rstrip() == str(sz.rstrip(big_whitespace))
    assert native_whitespace.rstrip() == str(big_whitespace.rstrip())

    # Test strip
    assert native_whitespace.strip() == str(sz.strip(big_whitespace))
    assert native_whitespace.strip() == str(big_whitespace.strip())

    # Test with custom character set
    native_custom = "aaabbbhello worldcccaaa"
    big_custom = Str(native_custom)
    chars = "abc"

    # Test lstrip with custom chars
    assert native_custom.lstrip(chars) == str(sz.lstrip(big_custom, chars))
    assert native_custom.lstrip(chars) == str(big_custom.lstrip(chars))

    # Test rstrip with custom chars
    assert native_custom.rstrip(chars) == str(sz.rstrip(big_custom, chars))
    assert native_custom.rstrip(chars) == str(big_custom.rstrip(chars))

    # Test strip with custom chars
    assert native_custom.strip(chars) == str(sz.strip(big_custom, chars))
    assert native_custom.strip(chars) == str(big_custom.strip(chars))

    # Test edge cases
    # Empty string
    empty = ""
    big_empty = Str(empty)
    assert empty.strip() == str(sz.strip(big_empty))
    assert empty.strip() == str(big_empty.strip())

    # String with only whitespace
    only_whitespace = " \t\n\r\f\v "
    big_only_whitespace = Str(only_whitespace)
    assert only_whitespace.strip() == str(sz.strip(big_only_whitespace))
    assert only_whitespace.strip() == str(big_only_whitespace.strip())

    # String with only custom chars
    only_custom = "aaabbbccc"
    big_only_custom = Str(only_custom)
    assert only_custom.strip("abc") == str(sz.strip(big_only_custom, "abc"))
    assert only_custom.strip("abc") == str(big_only_custom.strip("abc"))

    # String with no chars to strip
    no_strip = "hello world"
    big_no_strip = Str(no_strip)
    assert no_strip.strip() == str(sz.strip(big_no_strip))
    assert no_strip.strip() == str(big_no_strip.strip())

    # Test with bytes
    native_bytes = b"  hello world  "
    big_bytes = Str(native_bytes)
    assert native_bytes.strip() == bytes(sz.strip(big_bytes))
    assert native_bytes.strip() == bytes(big_bytes.strip())

    # Test asymmetric stripping
    native_asymmetric = "aaahello worldbbb"
    big_asymmetric = Str(native_asymmetric)

    # Only strip 'a' from left
    assert native_asymmetric.lstrip("a") == str(sz.lstrip(big_asymmetric, "a"))
    assert native_asymmetric.lstrip("a") == str(big_asymmetric.lstrip("a"))

    # Only strip 'b' from right
    assert native_asymmetric.rstrip("b") == str(sz.rstrip(big_asymmetric, "b"))
    assert native_asymmetric.rstrip("b") == str(big_asymmetric.rstrip("b"))

    # Test with special characters
    native_special = "!!!###hello world***!!!"
    big_special = Str(native_special)
    special_chars = "!#*"

    assert native_special.strip(special_chars) == str(sz.strip(big_special, special_chars))
    assert native_special.strip(special_chars) == str(big_special.strip(special_chars))

    # Test with single character
    native_single = "aaa"
    big_single = Str(native_single)
    assert native_single.strip("a") == str(sz.strip(big_single, "a"))
    assert native_single.strip("a") == str(big_single.strip("a"))

    # Test with no matching characters
    native_no_match = "hello world"
    big_no_match = Str(native_no_match)
    assert native_no_match.strip("xyz") == str(sz.strip(big_no_match, "xyz"))
    assert native_no_match.strip("xyz") == str(big_no_match.strip("xyz"))


def test_unit_strs_sequence():
    native = "p3\np2\np1"
    big = Str(native)

    lines = big.splitlines()
    assert [2, 1, 0] == list(lines.argsort())
    assert "p3" in lines
    assert "p4" not in lines

    assert repr(lines) == "sz.Strs(['p3', 'p2', 'p1'])"
    assert repr(Str("a" * 1_000_000).split()).endswith("... ])")

    assert str(lines) == "['p3', 'p2', 'p1']"
    assert str(Str("a" * 1_000_000).split()).startswith("['aaa")
    assert str(Str("a" * 1_000_000).split()).endswith("aaa']")

    lines_sorted = lines.sorted()
    assert [0, 1, 2] == list(lines_sorted.argsort())
    assert ["p1", "p2", "p3"] == list(lines_sorted)

    # Reverse order
    assert [2, 1, 0] == list(lines_sorted.argsort(reverse=True))
    lines_sorted_reverse = lines.sorted(reverse=True)
    assert ["p3", "p2", "p1"] == list(lines_sorted_reverse)

    # Sampling an array
    sampled = lines.sample(100, seed=42)
    assert "p3" in sampled
    assert "p4" not in sampled


def test_unit_slicing():
    native = "abcdef"
    big = Str(native)
    assert big[1:3] == "bc"
    assert big[1:] == "bcdef"
    assert big[:3] == "abc"
    assert big[-1:] == "f"
    assert big[:-1] == "abcde"
    assert big[-3:] == "def"
    assert big[:-3] == "abc"


def test_unit_strs_rich_comparisons():
    arr: Strs = Str("a b c d e f g h").split()

    # Test against another Strs object
    identical_arr: Strs = Str("a b c d e f g h").split()
    different_arr: Strs = Str("a b c d e f g i").split()
    shorter_arr: Strs = Str("a b c d e").split()
    longer_arr: Strs = Str("a b c d e f g h i j").split()

    assert arr == identical_arr
    assert arr != different_arr
    assert arr != shorter_arr
    assert arr != longer_arr
    assert shorter_arr < arr
    assert longer_arr > arr

    # Test against a Python list and a tuple
    list_equal = ["a", "b", "c", "d", "e", "f", "g", "h"]
    list_different = ["a", "b", "c", "d", "x", "f", "g", "h"]
    tuple_equal = ("a", "b", "c", "d", "e", "f", "g", "h")
    tuple_different = ("a", "b", "c", "d", "e", "f", "g", "i")

    assert arr == list_equal
    assert arr != list_different
    assert arr == tuple_equal
    assert arr != tuple_different

    # Test against a generator of unknown length
    generator_equal = (x for x in "a b c d e f g h".split())
    generator_different = (x for x in "a b c d e f g i".split())
    generator_shorter = (x for x in "a b c d e".split())
    generator_longer = (x for x in "a b c d e f g h i j".split())

    assert arr == generator_equal
    assert arr != generator_different
    assert arr != generator_shorter
    assert arr != generator_longer


def test_unit_strs_sequence_slicing():
    native = "1, 2, 3, 4, 5, 6"
    big = Str(native)
    big_sequence = big.split(", ")

    def to_str(seq):
        return "".join([str(x) for x in seq])

    assert big_sequence[1:3] == ["2", "3"], to_str(big_sequence[1:3])
    assert big_sequence[1:] == ["2", "3", "4", "5", "6"], to_str(big_sequence[1:])
    assert big_sequence[:3] == ["1", "2", "3"], to_str(big_sequence[:3])

    # Use negative indices to slice from the end
    assert big_sequence[-1:] == ["6"], to_str(big_sequence[-1:])
    assert big_sequence[:-1] == ["1", "2", "3", "4", "5"], to_str(big_sequence[:-1])
    assert big_sequence[-3:] == ["4", "5", "6"], to_str(big_sequence[-3:])
    assert big_sequence[:-3] == ["1", "2", "3"], to_str(big_sequence[:-3])

    # Introduce a step to skip some values
    assert big_sequence[::2] == ["1", "3", "5"], to_str(big_sequence[::2])
    assert big_sequence[::-1] == ["6", "5", "4", "3", "2", "1"], to_str(big_sequence[::-1])

    # Let's go harder with nested slicing
    assert big_sequence[1:][::-1] == ["6", "5", "4", "3", "2"]
    assert big_sequence[1:][::-2] == ["6", "4", "2"]
    assert big_sequence[1:][::-3] == ["6", "3"]

    # Edge cases: slices of length 1 and 0
    # Test single element slices (length 1)
    assert big_sequence[0:1] == ["1"], "Single element slice [0:1] failed"
    assert big_sequence[1:2] == ["2"], "Single element slice [1:2] failed"
    assert big_sequence[-1:] == ["6"], "Single element slice [-1:] failed"
    assert big_sequence[-2:-1] == ["5"], "Single element slice [-2:-1] failed"

    # Test empty slices (length 0)
    assert big_sequence[0:0] == [], "Empty slice [0:0] failed"
    assert big_sequence[1:1] == [], "Empty slice [1:1] failed"
    assert big_sequence[5:5] == [], "Empty slice [5:5] failed"
    assert big_sequence[10:10] == [], "Empty slice [10:10] out of bounds failed"
    assert big_sequence[1:0] == [], "Empty slice [1:0] (start > end) failed"
    assert big_sequence[3:1] == [], "Empty slice [3:1] (start > end) failed"

    # Test edge cases with pre-constructed Strs objects passed as slices
    single_slice = big_sequence[0:1]  # Length 1 slice
    empty_slice = big_sequence[0:0]  # Length 0 slice

    # Verify the slices work correctly when passed around
    assert len(single_slice) == 1, "Single slice length incorrect"
    assert len(empty_slice) == 0, "Empty slice length incorrect"
    assert str(single_slice[0]) == "1", "Single slice element access failed"

    # Test iteration over edge case slices
    single_items = [str(item) for item in single_slice]
    assert single_items == ["1"], "Single slice iteration failed"

    empty_items = [str(item) for item in empty_slice]
    assert empty_items == [], "Empty slice iteration failed"

    # Test repr and str don't crash on edge cases
    assert isinstance(repr(single_slice), str), "Single slice repr failed"
    assert isinstance(repr(empty_slice), str), "Empty slice repr failed"
    assert isinstance(str(single_slice), str), "Single slice str failed"
    assert isinstance(str(empty_slice), str), "Empty slice str failed"


def test_unit_globals():
    """Validates that the previously unit-tested member methods are also visible as global functions."""

    assert sz.find("abcdef", "bcdef") == 1
    assert sz.find("abcdef", "x") == -1
    assert sz.rfind("abcdef", "bcdef") == 1
    assert sz.rfind("abcdef", "x") == -1

    # Corner-cases for `find` and `rfind`, when we pass empty strings
    assert sz.find("abcdef", "") == "abcdef".find("")
    assert sz.rfind("abcdef", "") == "abcdef".rfind("")
    assert sz.find("abcdef", "", 1) == "abcdef".find("", 1)
    assert sz.rfind("abcdef", "", 1) == "abcdef".rfind("", 1)
    assert sz.find("abcdef", "", 1, 3) == "abcdef".find("", 1, 3)
    assert sz.rfind("abcdef", "", 1, 3) == "abcdef".rfind("", 1, 3)
    assert sz.find("", "abcdef") == "".find("abcdef")
    assert sz.rfind("", "abcdef") == "".rfind("abcdef")

    assert sz.find("Hello, world!", "world", 0, 11) == "Hello, world!".find("world", 0, 11)
    assert sz.rfind("Hello, world!", "world", 0, 11) == "Hello, world!".rfind("world", 0, 11)

    assert sz.find_first_of("abcdef", "cde") == 2
    assert sz.find_first_of("abcdef", "xyz") == -1
    assert sz.find_first_of("hello world", "aeiou") == 1

    assert sz.find_last_of("abcdef", "abc") == 2
    assert sz.find_last_of("abcdef", "xyz") == -1
    assert sz.find_last_of("hello world", "aeiou") == 7

    assert sz.find_first_not_of("aaabbbccc", "ab") == 6
    assert sz.find_first_not_of("abcdef", "abcdef") == -1
    assert sz.find_first_not_of("   hello", " ") == 3

    assert sz.find_last_not_of("aaabbbccc", "bc") == 2
    assert sz.find_last_not_of("abcdef", "abcdef") == -1
    assert sz.find_last_not_of("hello   ", " ") == 4

    # Test byteset counting
    assert sz.count_byteset("abcdef", "abc") == 3
    assert sz.count_byteset("abcdef", "xyz") == 0
    assert sz.count_byteset("hello world", "aeiou") == 3  # e, o, o
    assert sz.count_byteset("mississippi", "si") == 8  # s, i, s, s, i, s, s, i
    assert sz.count_byteset("", "abc") == 0
    assert sz.count_byteset("abc", "") == 0
    assert sz.count_byteset("abcdef", "abc", 1, 4) == 2  # bc in "bcd"
    assert sz.count_byteset("hello world", "aeiou", 2) == 2  # o, o
    assert sz.count_byteset("hello world", "aeiou", 0, 5) == 2  # e, o in "hello"

    # Compare partitioning functions
    assert sz.partition("abcdef", "c") == ("ab", "c", "def")
    assert sz.rpartition("abcdef", "c") == ("ab", "c", "def")

    with pytest.raises(ValueError):
        sz.partition("abcdef", "")
    with pytest.raises(ValueError):
        sz.rpartition("abcdef", "")

    assert sz.count("abcdef", "x") == 0
    assert sz.count("aaaaa", "a") == 5
    assert sz.count("aaaaa", "aa") == 2
    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4

    assert sz.bytesum("hello") > 0
    assert sz.bytesum("hello") > sz.bytesum("hell")

    assert sz.translate("ABC", {"A": "X", "B": "Y", "C": "Z"}) == "XYZ"
    assert sz.translate("ABC", {"A": "X", "B": "Y"}) == "XYC"
    assert sz.translate("ABC", {"A": "X", "B": "Y"}, start=1, end=-1) == "YC"
    assert sz.translate("ABC", bytes(range(256))) == "ABC"
    with pytest.raises(TypeError):
        sz.translate("ABC", {"A": "X", "B": "Y"}, start=1, end=-1, inplace=True)

    mutable_buffer = bytearray(b"ABC")
    assert sz.fill_random(mutable_buffer) is None
    assert sz.fill_random(mutable_buffer, 42) is None

    assert sz.split("hello world test", " ") == ["hello", "world", "test"]
    assert sz.rsplit("hello world test", " ", 1) == ["hello world", "test"]


def test_string_lengths():
    assert 4 == len(sz.Str("abcd"))
    assert 8 == len(sz.Str("αβγδ"))


@pytest.mark.parametrize(
    "byte_string, encoding, expected",
    [
        (b"hello world", "utf-8", "hello world"),
        (b"\xf0\x9f\x98\x81", "utf-8", "😁"),  # Emoji
        (b"hello world", "ascii", "hello world"),
        (b"\xf0hello world", "latin-1", "ðhello world"),
        (b"", "utf-8", ""),  # Empty string case
    ],
)
def test_decoding_valid_strings(byte_string, encoding, expected):
    assert byte_string.decode(encoding) == expected
    assert sz.Str(byte_string).decode(encoding) == expected


@pytest.mark.parametrize(
    "byte_string, encoding",
    [
        # Use `bytes.fromhex()` to avoid putting binary literals in source code
        # This prevents PyTest's source parsing from encountering invalid UTF-8
        (bytes.fromhex("ff"), "utf-8"),  # Invalid UTF-8 byte
        (bytes.fromhex("80") + b"hello", "ascii"),  # Non-ASCII byte in ASCII string
    ],
)
def test_decoding_exceptions(byte_string, encoding):
    with pytest.raises(UnicodeDecodeError):
        byte_string.decode(encoding)
    with pytest.raises(UnicodeDecodeError):
        sz.Str(byte_string).decode(encoding)


def test_slice_of_split():
    def impl(native_str: str):
        native_split = native_str.split()
        text = sz.Str(native_str)
        sz_split = text.split()
        for slice_idx in range(len(native_split)):
            assert str(sz_split[slice_idx]) == native_split[slice_idx]

    native_str = "Weebles wobble before they fall down, don't they?"
    impl(native_str)
    # ~5GB to overflow 32-bit sizes
    copies = int(len(native_str) / 5e9)
    # Eek. Cover 64-bit indices
    impl(native_str * copies)


def get_random_string(
    length: Optional[int] = None,
    variability: Optional[int] = None,
) -> str:
    if length is None:
        length = randint(3, 300)
    if variability is None:
        variability = len(ascii_lowercase)
    return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))


def is_equal_strings(native_strings, big_strings):
    for native_slice, big_slice in zip(native_strings, big_strings):
        assert native_slice == big_slice, f"Mismatch between `{native_slice}` and `{str(big_slice)}`"


def check_identical(
    native: str,
    big: Str,
    needle: Optional[str] = None,
    check_iterators: bool = False,
):
    if needle is None:
        part_offset = randint(0, len(native) - 1)
        part_length = randint(1, len(native) - part_offset)
        needle = native[part_offset:part_length]

    present_in_native: bool = needle in native
    present_in_big = needle in big
    assert present_in_native == present_in_big
    assert native.find(needle) == big.find(needle)
    assert native.rfind(needle) == big.rfind(needle)
    assert native.count(needle) == big.count(needle)

    # Check that the `start` and `stop` positions are correctly inferred
    len_half = len(native) // 2
    len_quarter = len(native) // 4
    assert native.find(needle, len_half) == big.find(needle, len_half)
    assert native.find(needle, len_quarter, 3 * len_quarter) == big.find(needle, len_quarter, 3 * len_quarter)

    # Check splits and other sequence operations
    native_strings = native.split(needle)
    big_strings: Strs = big.split(needle)
    assert len(native_strings) == len(big_strings)

    if check_iterators:
        for i in range(len(native_strings)):
            assert len(native_strings[i]) == len(big_strings[i])
            assert (
                native_strings[i] == big_strings[i]
            ), f"Mismatch between `{native_strings[i]}` and `{str(big_strings[i])}`"
            assert [c for c in native_strings[i]] == [c for c in big_strings[i]]

    is_equal_strings(native_strings, big_strings)


@pytest.mark.parametrize("repetitions", range(1, 10))
def test_fuzzy_repetitions(repetitions: int):
    native = "abcd" * repetitions
    big = Str(native)

    check_identical(native, big, "a", True)
    check_identical(native, big, "ab", True)
    check_identical(native, big, "abc", True)
    check_identical(native, big, "abcd", True)
    check_identical(native, big, "abcde", True)  # Missing pattern


@pytest.mark.parametrize("pattern_length", [1, 2, 3, 4, 5])
@pytest.mark.parametrize("haystack_length", range(1, 65))
@pytest.mark.parametrize("variability", range(1, 25))
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_fuzzy_substrings(pattern_length: int, haystack_length: int, variability: int, seed_value: int):
    seed_random_generators(seed_value)
    native = get_random_string(variability=variability, length=haystack_length)
    big = Str(native)
    pattern = get_random_string(variability=variability, length=pattern_length)
    assert (pattern in native) == big.contains(
        pattern
    ), f"Failed to check if {pattern} at offset {native.find(pattern)} is present in {native}"
    assert native.find(pattern) == big.find(
        pattern
    ), f"Failed to locate {pattern} at offset {native.find(pattern)} in {native}"


def baseline_translate(body: str, lut: Sequence) -> str:
    return "".join([chr(lut[ord(c)]) for c in body])


def translation_table_to_dict(lut: Sequence) -> Dict[int, str]:
    """Convert lookup table to translation dict for str.translate()"""
    return {i: chr(lut[i]) for i in range(256)}


@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
@pytest.mark.parametrize("length", range(1, 300))
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_translations(length: int, seed_value: int):
    seed_random_generators(seed_value)

    map_identity = np.arange(256, dtype=np.uint8)
    map_invert = np.arange(255, -1, -1, dtype=np.uint8)
    map_threshold = np.where(np.arange(256) > 127, 255, 0).astype(np.uint8)
    dict_identity = translation_table_to_dict(map_identity)
    dict_invert = translation_table_to_dict(map_invert)
    dict_threshold = translation_table_to_dict(map_threshold)
    view_identity = memoryview(map_identity)
    view_invert = memoryview(map_invert)
    view_threshold = memoryview(map_threshold)

    body = get_random_string(length=length)
    body_bytes = body.encode("utf-8")

    # Check mapping strings and byte-strings into new strings
    assert sz.translate(body, view_identity) == body
    assert sz.translate(body_bytes, view_identity) == body_bytes
    assert sz.translate(body_bytes, view_identity) == body_bytes.translate(view_identity)
    assert sz.translate(body_bytes, view_invert) == body_bytes.translate(view_invert)
    assert sz.translate(body_bytes, view_threshold) == body_bytes.translate(view_threshold)

    # Check in-place translations on mutable byte-arrays - all of them return nothing
    after_identity = bytearray(body_bytes)
    assert sz.translate(after_identity, view_identity, inplace=True) is None
    assert sz.equal(after_identity, body.translate(dict_identity))
    after_invert = bytearray(body_bytes)
    assert sz.translate(after_invert, view_invert, inplace=True) is None
    assert sz.equal(after_invert, body.translate(dict_invert))
    after_threshold = bytearray(body_bytes)
    assert sz.translate(after_threshold, view_threshold, inplace=True) is None
    assert sz.equal(after_threshold, body.translate(dict_threshold))


@pytest.mark.parametrize("length", list(range(0, 300)) + [1024, 4096, 100000])
@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_translations_random(length: int, seed_value: int):
    seed_random_generators(seed_value)
    body = get_random_string(length=length)
    lut = np.random.randint(0, 256, size=256, dtype=np.uint8)
    assert sz.translate(body, memoryview(lut)) == baseline_translate(body, lut)


@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_fill_random_slice(seed_value: int):
    # Prepare a zeroed buffer and keep a copy for comparison
    original = bytearray(64)
    updated_in_slices = bytearray(original)

    # Fill only a slice [start:end) deterministically
    start, end = 10, 30
    sz.fill_random(updated_in_slices, nonce=seed_value, start=start, end=end)

    # Unchanged prefix and suffix
    assert bytes(updated_in_slices[:start]) == bytes(original[:start])
    assert bytes(updated_in_slices[end:]) == bytes(original[end:])

    # Changed inner region
    assert bytes(updated_in_slices[start:end]) != bytes(original[start:end])


def test_fill_random_different_nonces():
    first_buffer = bytearray(64)
    second_buffer = bytearray(64)
    sz.fill_random(first_buffer, nonce=1)
    sz.fill_random(second_buffer, nonce=2)
    assert bytes(first_buffer) != bytes(second_buffer)


@pytest.mark.parametrize("length", [0, 1, 7, 64])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_fill_random_alphabet(length: int, seed_value: int):

    # Same nonce should produce the same result
    random_string = sz.random(length, nonce=seed_value)
    same_nonce_random_string = sz.random(length, nonce=seed_value)
    assert isinstance(random_string, (bytes, bytearray))
    assert len(random_string) == length
    assert random_string == same_nonce_random_string

    # With alphabet: all bytes must belong to alphabet
    alphabet = b"0123456789"
    random_digits = sz.random(128, nonce=seed_value, alphabet=alphabet)
    assert set(random_digits).issubset(set(alphabet))


@pytest.mark.parametrize("body", ["", "hello", "world", "abcdefg", "a" * 32])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_hash_basic_equivalence(body: str, seed_value: int):
    # TODO: Add streaming hashers and compare slices vs overall
    hash_seeded = sz.hash(body, seed=seed_value)
    hash_member = sz.Str(body).hash(seed=seed_value)
    assert hash_seeded == hash_member


@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_hasher_incremental_vs_one_shot(seed_value: int):
    data_full = b"hello world"
    data_prefix = b"hello "
    data_suffix = b"world"

    hasher = sz.Hasher(seed=seed_value)
    hasher.update(data_prefix)
    hasher.update(data_suffix)
    streamed_hash = hasher.digest()

    expected_hash = sz.hash(data_full, seed=seed_value)
    assert isinstance(streamed_hash, int)
    assert streamed_hash == expected_hash


@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_hasher_reset_and_hexdigest(seed_value: int):
    data = b"some test payload"
    hasher = sz.Hasher(seed=seed_value)
    hasher.update(data)
    streamed_hash = hasher.digest()
    streamed_hex = hasher.hexdigest()
    assert isinstance(streamed_hex, str) and len(streamed_hex) == 16 and streamed_hex == format(streamed_hash, "016x")

    hasher.reset()
    hasher.update(data)
    re_streamed_hash = hasher.digest()
    re_streamed_hex = hasher.hexdigest()
    assert streamed_hash == re_streamed_hash
    assert streamed_hex == re_streamed_hex


@pytest.mark.parametrize("length", list(range(0, 300)) + [1024, 4096, 100000])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_bytesum_random(length: int, seed_value: int):
    def sum_bytes(body: str) -> int:
        return sum([ord(c) for c in body])

    seed_random_generators(seed_value)
    body = get_random_string(length=length)
    assert sum_bytes(body) == sz.bytesum(body)


@pytest.mark.parametrize("length", [0, 1, 3, 7, 15, 31, 63, 64, 65, 127, 128, 129, 255, 256, 1000, 4096, 10000])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_sha256(length: int, seed_value: int):

    seed_random_generators(seed_value)
    text = get_random_string(length=length)
    expected = hashlib.sha256(text.encode()).digest()

    # One-shot: standalone function
    assert sz.sha256(text) == expected

    # One-shot: Str method
    assert Str(text).sha256() == expected

    # One-shot: bytes input
    assert sz.sha256(text.encode()) == expected

    # Progressive: single update
    h = sz.Sha256()
    h.update(text)
    assert h.digest() == expected
    assert h.hexdigest() == expected.hex()

    # Progressive: chunked updates
    if length > 0:
        h = sz.Sha256()
        chunk_size = max(1, length // 3)
        for i in range(0, length, chunk_size):
            h.update(text[i : i + chunk_size])
        assert h.digest() == expected
        assert h.hexdigest() == expected.hex()

    # Reset
    h.reset().update(text)
    assert h.digest() == expected

    # Copy
    mid = length // 2
    h1 = sz.Sha256().update(text[:mid])
    h2 = h1.copy()
    h1.update(text[mid:])
    h2.update(text[mid:])
    assert h1.digest() == h2.digest() == expected


@pytest.mark.parametrize("key_length", [0, 1, 16, 32, 64, 65, 128])
@pytest.mark.parametrize("message_length", [0, 1, 63, 64, 65, 127, 128, 1000])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_hmac_sha256(key_length: int, message_length: int, seed_value: int):

    seed_random_generators(seed_value)
    key = get_random_string(length=key_length).encode()
    message = get_random_string(length=message_length).encode()

    # Test against Python's hmac module
    expected = hmac.new(key, message, hashlib.sha256).digest()
    result = sz.hmac_sha256(key, message)
    assert result == expected

    # Test with string inputs
    key_str = key.decode("latin1")
    message_str = message.decode("latin1")
    result_str = sz.hmac_sha256(key_str, message_str)
    assert result_str == expected


@pytest.mark.parametrize("list_length", [10, 20, 30, 40, 50])
@pytest.mark.parametrize("part_length", [5, 10])
@pytest.mark.parametrize("variability", [2, 3])
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_fuzzy_sorting(list_length: int, part_length: int, variability: int, seed_value: int):
    seed_random_generators(seed_value)
    native_list = [get_random_string(variability=variability, length=part_length) for _ in range(list_length)]
    native_joined = ".".join(native_list)
    big_joined = Str(native_joined)
    big_list = big_joined.split(".")

    # Before testing sorting, validate pairwise comparator consistency
    def py_cmp(a: str, b: str) -> int:
        return -1 if a < b else (1 if a > b else 0)

    def sz_cmp(a: str, b: str) -> int:
        sa, sb = Str(a), Str(b)
        if sa < sb:
            return -1
        if sa > sb:
            return 1
        return 0

    # Check every consecutive pair a[i], a[i+1]
    for i in range(len(native_list) - 1):
        a, b = native_list[i], native_list[i + 1]
        assert py_cmp(a, b) == sz_cmp(a, b), f"Comparator mismatch at {i}: '{a}' vs '{b}'"

    native_ordered = sorted(native_list)
    native_order = big_list.argsort()
    for i in range(list_length):
        assert native_ordered[i] == native_list[native_order[i]], "Order is wrong"
        assert native_ordered[i] == str(big_list[int(native_order[i])]), "Split is wrong?!"

    native_list.sort()
    big_list = big_list.sorted()

    assert len(native_list) == len(big_list)
    for native_str, big_str in zip(native_list, big_list):
        assert native_str == str(big_str), "Order is wrong"


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_str_to_pyarrow_conversion():
    native = "hello"
    big = Str(native)
    assert isinstance(big.address, int) and big.address != 0
    assert isinstance(big.nbytes, int) and big.nbytes == len(native)

    arrow_buffer = pa.foreign_buffer(big.address, big.nbytes, big)
    assert arrow_buffer.to_pybytes() == native.encode("utf-8")


@pytest.mark.parametrize("container_class", [tuple, list, iter])
@pytest.mark.parametrize("view", [False, True])
def test_strs_from_python_basic(container_class: type, view: bool):
    """Test basic conversion from Python containers to Strs."""
    base_items = ["hello", "world", "test", " ", "from", " ", "container", ""]
    container = container_class(base_items)

    # Skip iter+view combination as it's not supported
    if container_class == iter and view:
        with pytest.raises(ValueError, match="View mode.*not supported for iterators"):
            Strs(container, view=view)
        return

    strs = Strs(container, view=view)

    assert len(strs) == len(base_items)
    assert strs[0] == "hello"
    assert strs[1] == "world"
    assert strs[2] == "test"
    assert strs[3] == " "
    assert strs[4] == "from"
    assert strs[5] == " "
    assert strs[6] == "container"
    assert strs[7] == ""


UINT32_MAX = 2**32 - 1  # ! Many of the 32/64-bit algo corner cases happen at this input size


def long_repeated_string(ctypes, fill_char: str, string_size: int) -> str:
    buffer = ctypes.create_string_buffer(string_size)
    ctypes.memset(buffer, ord(fill_char), string_size)
    return buffer.value.decode("ascii")


@pytest.mark.skipif(sys.maxsize <= 2**32, reason="64-bit system required for 4GB+ test")
def test_strs_from_4gb_list():
    """Test Strs with >4GB array of strings to verify 32-bit to 64-bit layout transition.
    This will require over 8 GB of memory. To stress-test the behavior, limit memory per process. For 5 and 13 GB:

    ulimit -v 9437184 && uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x -k 4gb_list
    ulimit -v 13631488 && uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x -k 4gb_list
    """

    try:
        import gc
        import ctypes
    except ImportError:
        pytest.skip("ctypes & gc not available (e.g., PyPy)")

    # Each individual string won't be very large, but many of them will be used
    part_size = 64 * 1024 * 1024
    parts_count = math.ceil(UINT32_MAX / part_size) + 1  # Ensures we exceed UINT32_MAX by a small margin
    try:
        parts_pythonic = [
            long_repeated_string(ctypes, ascii_lowercase[part_index % len(ascii_lowercase)], part_size)
            for part_index in range(parts_count)
        ]
        parts_stringzilla = Strs(parts_pythonic)

        # Basic verification
        last_used_char = ascii_lowercase[(parts_count - 1) % len(ascii_lowercase)]
        assert len(parts_stringzilla) == parts_count
        assert parts_stringzilla[0] == long_repeated_string(ctypes, "a", part_size)
        assert parts_stringzilla[-1] == long_repeated_string(ctypes, last_used_char, part_size)

        del parts_pythonic
        del parts_stringzilla
    except (MemoryError, OSError):
        pytest.skip("Memory allocation failed")
    finally:
        gc.collect()


@pytest.mark.skipif(sys.maxsize <= 2**32, reason="64-bit system required for 4GB+ test")
def test_strs_from_4gb_generator():
    """Test Strs with >4GB of strings streams to verify 32-bit to 64-bit layout transition.
    This will require over 8 GB of memory. To stress-test the behavior, limit memory per process. For 5 and 13 GB:

    ulimit -v 5242880 && uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x -k 4gb_generator
    ulimit -v 13631488 && uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x -k 4gb_generator
    """

    try:
        import gc
        import ctypes
    except ImportError:
        pytest.skip("ctypes & gc not available (e.g., PyPy)")

    # Each individual string won't be very large, but many of them will be used
    part_size = 64 * 1024 * 1024
    parts_count = math.ceil(UINT32_MAX / part_size) + 1  # Ensures we exceed UINT32_MAX by a small margin
    try:
        parts_stringzilla = Strs(
            long_repeated_string(ctypes, ascii_lowercase[part_index % len(ascii_lowercase)], part_size)
            for part_index in range(parts_count)
        )

        # Basic verification
        last_used_char = ascii_lowercase[(parts_count - 1) % len(ascii_lowercase)]
        assert len(parts_stringzilla) == parts_count
        assert parts_stringzilla[0] == long_repeated_string(ctypes, "a", part_size)
        assert parts_stringzilla[-1] == long_repeated_string(ctypes, last_used_char, part_size)

        del parts_stringzilla
    except (MemoryError, OSError):
        pytest.skip("Memory allocation failed")
    finally:
        gc.collect()


@pytest.mark.parametrize("container_class", [tuple, list, iter])
@pytest.mark.parametrize("view", [False, True])
def test_strs_reference_counting(container_class: type, view: bool):
    """Test reference counting to prevent memory leaks."""

    # CPython-only: PyPy and other interpreters may not expose refcounts or use a different GC model
    if not hasattr(sys, "getrefcount"):
        pytest.skip("Reference counting semantics are not available")

    import gc

    base_items = ["ref", "count", "test"]
    container = container_class(base_items)

    # Skip iter+view combination as it's not supported
    if container_class == iter and view:
        with pytest.raises(ValueError, match="View mode.*not supported for iterators"):
            Strs(container, view=view)
        return

    initial_refcount = sys.getrefcount(container)

    strs = Strs(container, view=view)
    during_refcount = sys.getrefcount(container)

    # For iterators, we can't check refcount behavior the same way since iter() creates a new object
    # and the iterator consumes the original container during iteration
    if container_class != iter:
        # View mode should increment refcount, copy mode should not
        if view:
            assert during_refcount == initial_refcount + 1, "View mode should increment refcount"
        else:
            assert during_refcount == initial_refcount, "Copy mode should not change refcount"

    # Verify functionality
    assert len(strs) == 3
    assert strs[0] == "ref"
    assert strs[1] == "count"
    assert strs[2] == "test"

    del strs
    gc.collect()

    if container_class != iter:
        final_refcount = sys.getrefcount(container)
        assert final_refcount == initial_refcount, "Refcount should return to initial value"


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
@pytest.mark.parametrize("view", [False, True])
def test_strs_from_arrow_basic(view: bool):
    """Test basic conversion from Arrow string array to Strs."""
    arrow_array = pa.array(["hello", "world", "test", "arrow"])
    strs = Strs(arrow_array, view=view)

    assert len(strs) == 4
    assert strs[0] == "hello"
    assert strs[1] == "world"
    assert strs[2] == "test"
    assert strs[3] == "arrow"


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_strs_from_arrow_binary_array():
    """Test conversion from Arrow binary array."""
    binary_data = [b"hello", b"world", b"binary", b"data"]
    arrow_array = pa.array(binary_data, type=pa.binary())

    strs = Strs(arrow_array)

    assert len(strs) == 4
    # Strs should handle binary data properly - compare as bytes
    for i, expected in enumerate(binary_data):
        str_bytes = strs[i].encode("latin-1") if isinstance(strs[i], str) else bytes(strs[i])
        assert str_bytes == expected


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_strs_from_arrow_large_strings():
    """Test conversion from Arrow large string array."""
    arrow_array = pa.array(["hello", "world", "large", "strings"], type=pa.large_string())

    strs = Strs(arrow_array)

    assert len(strs) == 4
    assert strs[0] == "hello"
    assert strs[1] == "world"
    assert strs[2] == "large"
    assert strs[3] == "strings"


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_strs_from_arrow_error_cases():
    """Test error handling for invalid inputs."""
    # Test with non-iterable object
    with pytest.raises(TypeError):
        Strs(123)  # Integer is not iterable

    with pytest.raises(TypeError):
        Strs(None)  # None is not iterable

    # Test with non-string Arrow array
    int_array = pa.array([1, 2, 3, 4])
    with pytest.raises((TypeError, ValueError)):
        Strs(int_array)


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_strs_from_arrow_c_interface():
    """Test the low-level Arrow C Data Interface."""
    arrow_array = pa.array(["test", "c", "interface"])

    # Check that Arrow array has the __arrow_c_array__ method
    assert hasattr(arrow_array, "__arrow_c_array__")

    # Get the C interface capsules
    schema_capsule, array_capsule = arrow_array.__arrow_c_array__()

    # Verify capsules are valid PyCapsule objects
    if sys.version_info >= (3, 1):
        assert str(type(schema_capsule)) == "<class 'PyCapsule'>"
        assert str(type(array_capsule)) == "<class 'PyCapsule'>"

    # Test actual conversion
    strs = Strs(arrow_array)
    assert len(strs) == 3
    assert strs[0] == "test"
    assert strs[1] == "c"
    assert strs[2] == "interface"


@pytest.mark.skipif(not pyarrow_available, reason="PyArrow is not installed")
def test_strs_from_arrow_with_nulls():
    """Test conversion from Arrow array with null values (validity bits)."""
    # Create an array with None values
    arrow_array = pa.array(["hello", None, "world", None, "test"])

    strs = Strs(arrow_array)

    assert len(strs) == 5
    assert strs[0] == "hello"
    assert strs[1] == ""  # None values should be converted to empty strings
    assert strs[2] == "world"
    assert strs[3] == ""  # None values should be converted to empty strings
    assert strs[4] == "test"

    # Test with all nulls
    all_nulls = pa.array([None, None, None], type=pa.string())
    strs_nulls = Strs(all_nulls)
    assert len(strs_nulls) == 3
    assert all(s == "" for s in strs_nulls)

    # Test with mixed None and empty strings
    mixed = pa.array(["", None, "hello", "", None, "world"])
    strs_mixed = Strs(mixed)
    assert len(strs_mixed) == 6
    assert strs_mixed[0] == ""
    assert strs_mixed[1] == ""
    assert strs_mixed[2] == "hello"
    assert strs_mixed[3] == ""
    assert strs_mixed[4] == ""
    assert strs_mixed[5] == "world"


def test_invalid_utf8_handling():
    """Test that both Str and Strs handle invalid UTF-8 bytes gracefully."""

    # Test arrays with invalid UTF-8 sequences
    test_arrays = [
        # Use `bytes.fromhex()` to avoid putting binary literals in source code
        # This prevents PyTest's source parsing from encountering invalid UTF-8
        [b"hello", bytes.fromhex("80") + b"world", b"valid"],  # Mixed valid/invalid
        [bytes.fromhex("fffe"), bytes.fromhex("80"), bytes.fromhex("f4908080")],  # All invalid
        [b"normal", b"string with " + bytes.fromhex("80") + b" bytes"],  # Partial invalid
    ]

    for test_array in test_arrays:
        strs_obj = Strs(test_array)

        # These should never raise exceptions
        repr_result = repr(strs_obj)
        str_result = str(strs_obj)

        # Basic assertions
        assert isinstance(repr_result, str)
        assert isinstance(str_result, str)
        assert len(repr_result) > 0
        assert len(str_result) > 0


def test_unit_utf8_case_fold():
    """Test basic case folding functionality."""
    # ASCII
    assert sz.utf8_case_fold("HELLO") == b"hello"
    assert sz.utf8_case_fold("Hello World") == b"hello world"
    assert sz.utf8_case_fold("") == b""
    assert sz.utf8_case_fold("already lowercase") == b"already lowercase"

    # German sharp S expansion (ß → ss)
    assert sz.utf8_case_fold("Straße") == b"strasse"
    assert sz.utf8_case_fold("GROSSE") == b"grosse"

    # Method form on Str
    assert sz.Str("HELLO").utf8_case_fold() == b"hello"

    # Bytes input
    assert sz.utf8_case_fold(b"HELLO") == b"hello"


@pytest.mark.parametrize(
    "input_str, expected",
    [
        ("A", b"a"),
        ("Z", b"z"),
        ("ß", b"ss"),  # German sharp S (U+00DF)
        ("ẞ", b"ss"),  # Capital sharp S (U+1E9E)
        ("ﬁ", b"fi"),  # fi ligature (U+FB01)
        ("ﬀ", b"ff"),  # ff ligature (U+FB00)
        ("ﬂ", b"fl"),  # fl ligature (U+FB02)
        ("ﬃ", b"ffi"),  # ffi ligature (U+FB03)
        ("ﬄ", b"ffl"),  # ffl ligature (U+FB04)
        ("Σ", "σ".encode("utf-8")),  # Greek Sigma
        ("Ω", "ω".encode("utf-8")),  # Greek Omega
        ("Ä", "ä".encode("utf-8")),  # German umlaut
        ("É", "é".encode("utf-8")),  # French accent
        ("Ñ", "ñ".encode("utf-8")),  # Spanish tilde
    ],
)
def test_utf8_case_fold_expansions(input_str, expected):
    """Test case folding with specific known transformations including expansions."""
    assert sz.utf8_case_fold(input_str) == expected


# Backward compatibility aliases
_parse_case_folding_file = parse_case_folding_file
_get_case_folding_rules = get_case_folding_rules


def test_utf8_case_fold_all_codepoints():
    """Compare StringZilla case folding with Unicode 17.0 CaseFolding.txt rules.

    This test downloads the official Unicode 17.0 case folding data file to validate
    StringZilla's implementation, independent of Python's Unicode version.
    The file is cached in the system temp directory for subsequent runs.
    """
    # Load Unicode 17.0 case folding rules (downloads and caches automatically)
    unicode_folds = _get_case_folding_rules("17.0.0")

    mismatches = []
    missing_folds = []
    extra_folds = []

    for codepoint in range(0x110000):
        # Skip surrogates (not valid in UTF-8)
        if 0xD800 <= codepoint <= 0xDFFF:
            continue

        try:
            char = chr(codepoint)
            char_bytes = char.encode("utf-8")
            sz_folded = sz.utf8_case_fold(char)

            # Get expected folding from Unicode 17.0 rules
            # If not in the table, character maps to itself
            expected = unicode_folds.get(codepoint, char_bytes)

            if sz_folded != expected:
                entry = (f"U+{codepoint:04X}", repr(char), expected.hex(), sz_folded.hex())
                if codepoint in unicode_folds and sz_folded == char_bytes:
                    missing_folds.append(entry)  # StringZilla didn't fold but should have
                elif codepoint not in unicode_folds and sz_folded != char_bytes:
                    extra_folds.append(entry)  # StringZilla folded but shouldn't have
                else:
                    mismatches.append(entry)  # Both fold but to different targets
        except (ValueError, UnicodeEncodeError):
            continue

    total_errors = len(mismatches) + len(missing_folds) + len(extra_folds)
    assert total_errors == 0, (
        f"Case folding errors vs Unicode 17.0 ({len(unicode_folds)} rules): "
        f"{len(mismatches)} wrong targets, {len(missing_folds)} missing, {len(extra_folds)} extra. "
        f"First 10: {(mismatches + missing_folds + extra_folds)[:10]}"
    )


@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_utf8_case_fold_random_strings(seed_value: int):
    """Test case folding on random multi-codepoint strings."""
    seed(seed_value)

    # Test with ASCII uppercase
    for _ in range(50):
        length = randint(1, 100)
        test_str = "".join(chr(randint(0x41, 0x5A)) for _ in range(length))  # A-Z
        python_folded = test_str.casefold().encode("utf-8")
        sz_folded = sz.utf8_case_fold(test_str)
        assert python_folded == sz_folded, f"Mismatch for: {test_str!r}"

    # Test with Latin Extended characters
    for _ in range(50):
        length = randint(1, 50)
        # Mix of ASCII uppercase and Latin Extended (includes ß, etc.)
        codepoints = [randint(0x41, 0x5A) for _ in range(length)]
        codepoints += [randint(0xC0, 0xFF) for _ in range(length // 2)]
        test_str = "".join(chr(cp) for cp in codepoints)
        python_folded = test_str.casefold().encode("utf-8")
        sz_folded = sz.utf8_case_fold(test_str)
        assert python_folded == sz_folded, f"Mismatch for: {test_str!r}"


def test_utf8_case_insensitive_find():
    """Test case-insensitive UTF-8 substring search."""
    # Basic ASCII
    assert sz.utf8_case_insensitive_find("Hello World", "WORLD") == 6
    assert sz.utf8_case_insensitive_find("Hello World", "hello") == 0
    assert sz.utf8_case_insensitive_find("Hello World", "xyz") == -1

    # Case variations
    assert sz.utf8_case_insensitive_find("HELLO", "hello") == 0
    assert sz.utf8_case_insensitive_find("hello", "HELLO") == 0
    assert sz.utf8_case_insensitive_find("HeLLo WoRLd", "world") == 6

    # German sharp S: ß folds to "ss"
    assert sz.utf8_case_insensitive_find("Straße", "STRASSE") == 0
    assert sz.utf8_case_insensitive_find("STRASSE", "straße") == 0
    assert sz.utf8_case_insensitive_find("die Straße", "STRASSE") == 4

    # Greek letters
    assert sz.utf8_case_insensitive_find("ΑΒΓΔ", "αβγδ") == 0
    assert sz.utf8_case_insensitive_find("αβγδ", "ΑΒΓΔ") == 0

    # Cyrillic
    assert sz.utf8_case_insensitive_find("ПРИВЕТ", "привет") == 0
    assert sz.utf8_case_insensitive_find("привет", "ПРИВЕТ") == 0

    # Empty cases
    assert sz.utf8_case_insensitive_find("hello", "") == 0
    assert sz.utf8_case_insensitive_find("", "x") == -1
    assert sz.utf8_case_insensitive_find("", "") == 0

    # Partial matches
    assert sz.utf8_case_insensitive_find("abcdef", "CD") == 2
    assert sz.utf8_case_insensitive_find("ABCDEF", "cd") == 2

    # Method form on Str
    s = sz.Str("Hello World")
    assert s.utf8_case_insensitive_find("WORLD") == 6


def test_utf8_case_insensitive_order():
    """Test case-insensitive UTF-8 comparison."""
    # Equal strings
    assert sz.utf8_case_insensitive_order("hello", "HELLO") == 0
    assert sz.utf8_case_insensitive_order("HELLO", "hello") == 0
    assert sz.utf8_case_insensitive_order("HeLLo", "hElLO") == 0

    # German sharp S equivalence
    assert sz.utf8_case_insensitive_order("Straße", "STRASSE") == 0
    assert sz.utf8_case_insensitive_order("strasse", "STRAẞE") == 0

    # Less than
    assert sz.utf8_case_insensitive_order("apple", "BANANA") < 0
    assert sz.utf8_case_insensitive_order("APPLE", "banana") < 0
    assert sz.utf8_case_insensitive_order("a", "B") < 0

    # Greater than
    assert sz.utf8_case_insensitive_order("ZEBRA", "apple") > 0
    assert sz.utf8_case_insensitive_order("zebra", "APPLE") > 0
    assert sz.utf8_case_insensitive_order("z", "A") > 0

    # Empty strings
    assert sz.utf8_case_insensitive_order("", "") == 0
    assert sz.utf8_case_insensitive_order("", "a") < 0
    assert sz.utf8_case_insensitive_order("a", "") > 0

    # Prefix ordering
    assert sz.utf8_case_insensitive_order("hello", "HELLO WORLD") < 0
    assert sz.utf8_case_insensitive_order("HELLO WORLD", "hello") > 0

    # Greek letters
    assert sz.utf8_case_insensitive_order("ΑΒΓΔ", "αβγδ") == 0
    assert sz.utf8_case_insensitive_order("αβγ", "ΑΒΓ") == 0

    # Cyrillic
    assert sz.utf8_case_insensitive_order("ПРИВЕТ", "привет") == 0

    # Method form on Str
    s = sz.Str("hello")
    assert s.utf8_case_insensitive_order("HELLO") == 0


def test_unit_utf8_count():
    """Test UTF-8 character counting (codepoints, not bytes)."""
    # ASCII strings: len == utf8_count
    assert sz.utf8_count("hello") == 5
    assert sz.utf8_count("") == 0
    assert sz.utf8_count("a") == 1

    # Multi-byte UTF-8: character count != byte count
    assert sz.utf8_count("café") == 4  # é is 2 bytes but 1 char
    assert sz.utf8_count("日本語") == 3  # Each CJK char is 3 bytes
    assert sz.utf8_count("🎉") == 1  # Emoji is 4 bytes but 1 char
    assert sz.utf8_count("👨‍👩‍👧") == 5  # Family emoji: 3 people + 2 ZWJ

    # Mixed ASCII and multi-byte
    assert sz.utf8_count("hello世界") == 7  # 5 ASCII + 2 CJK
    assert sz.utf8_count("naïve") == 5  # ï is 2 bytes

    # Method form on Str
    assert sz.Str("café").utf8_count() == 4

    # Bytes input
    assert sz.utf8_count(b"caf\xc3\xa9") == 4  # café in UTF-8 bytes

    # Various multi-byte sequences
    assert sz.utf8_count("αβγδ") == 4  # Greek letters (2 bytes each)
    assert sz.utf8_count("АБВГ") == 4  # Cyrillic letters (2 bytes each)
    assert sz.utf8_count("𐍈") == 1  # Gothic letter (4 bytes)


def test_utf8_splitlines_iter():
    """Test UTF-8 line splitting iterator."""
    # Basic newline characters
    assert list(sz.utf8_splitlines_iter("a\nb\nc")) == [Str("a"), Str("b"), Str("c")]
    assert list(sz.utf8_splitlines_iter("a\rb\rc")) == [Str("a"), Str("b"), Str("c")]
    assert list(sz.utf8_splitlines_iter("a\r\nb\r\nc")) == [Str("a"), Str("b"), Str("c")]

    # CRLF treated as single delimiter
    result = list(sz.utf8_splitlines_iter("line1\r\nline2"))
    assert len(result) == 2
    assert result[0] == Str("line1")
    assert result[1] == Str("line2")

    # Empty string: yields one empty Str (differs from Python's splitlines which returns [])
    assert list(sz.utf8_splitlines_iter("")) == [Str("")]

    # No newlines
    assert list(sz.utf8_splitlines_iter("hello")) == [Str("hello")]

    # Consecutive newlines create empty lines
    result = list(sz.utf8_splitlines_iter("a\n\nb"))
    assert len(result) == 3
    assert result[1] == Str("")  # Empty line between

    # Trailing newline: includes trailing empty (differs from Python which omits it)
    result = list(sz.utf8_splitlines_iter("a\nb\n"))
    assert len(result) == 3
    assert result[2] == Str("")

    # keepends=True
    result = list(sz.utf8_splitlines_iter("a\nb\nc", keepends=True))
    assert result[0] == Str("a\n")
    assert result[1] == Str("b\n")
    assert result[2] == Str("c")  # Last line has no newline

    # keepends=True with CRLF
    result = list(sz.utf8_splitlines_iter("a\r\nb", keepends=True))
    assert result[0] == Str("a\r\n")
    assert result[1] == Str("b")

    # Unicode newlines: vertical tab, form feed
    assert len(list(sz.utf8_splitlines_iter("a\vb\fc"))) == 3

    # Unicode newlines: NEL (U+0085), Line Separator (U+2028), Paragraph Separator (U+2029)
    nel = "\u0085"
    line_sep = "\u2028"
    para_sep = "\u2029"
    assert len(list(sz.utf8_splitlines_iter(f"a{nel}b"))) == 2
    assert len(list(sz.utf8_splitlines_iter(f"a{line_sep}b"))) == 2
    assert len(list(sz.utf8_splitlines_iter(f"a{para_sep}b"))) == 2

    # Method form on Str
    result = list(sz.Str("a\nb").utf8_splitlines_iter())
    assert result == [Str("a"), Str("b")]

    # skip_empty=True skips ALL empty segments (leading, middle, trailing)
    # This differs from Python's splitlines() which keeps middle empty lines
    assert list(sz.utf8_splitlines_iter("", skip_empty=True)) == []
    assert list(sz.utf8_splitlines_iter("a\nb\n", skip_empty=True)) == [Str("a"), Str("b")]
    assert list(sz.utf8_splitlines_iter("a\n\nb", skip_empty=True)) == [Str("a"), Str("b")]  # skips middle empty
    assert list(sz.utf8_splitlines_iter("\na\n", skip_empty=True)) == [Str("a")]  # skips leading & trailing


@pytest.mark.parametrize(
    "text",
    ["hello\nworld", "a\r\nb\nc", "no newline", "trailing\n"],
)
def test_utf8_splitlines_iter_matches_python(text):
    """Verify skip_empty=True matches Python's splitlines() for cases without middle empty lines."""
    py_result = text.splitlines()
    sz_result = [str(s) for s in sz.utf8_splitlines_iter(text, skip_empty=True)]
    assert sz_result == py_result


def test_utf8_split_iter():
    """Test UTF-8 whitespace splitting iterator."""
    # Basic whitespace: space, tab, newline
    result = list(sz.utf8_split_iter("hello world"))
    assert result == [Str("hello"), Str("world")]

    result = list(sz.utf8_split_iter("a\tb\nc"))
    assert result == [Str("a"), Str("b"), Str("c")]

    # Empty string: yields one empty Str (differs from Python's split which returns [])
    assert list(sz.utf8_split_iter("")) == [Str("")]

    # String with only whitespace: yields empty strings between delimiters
    # (differs from Python's split() which returns [] for whitespace-only strings)
    result = list(sz.utf8_split_iter("   "))
    assert all(str(s) == "" for s in result)

    # Consecutive whitespace creates empty strings between each delimiter
    # (differs from Python's split() which treats consecutive whitespace as single delimiter)
    result = list(sz.utf8_split_iter("a   b"))
    assert len(result) == 4  # a, '', '', b
    assert str(result[0]) == "a"
    assert str(result[-1]) == "b"

    # Leading/trailing whitespace creates empty strings
    result = list(sz.utf8_split_iter("  hello  "))
    assert "hello" in [str(s) for s in result]

    # No whitespace
    assert list(sz.utf8_split_iter("hello")) == [Str("hello")]

    # Unicode whitespace: NO-BREAK SPACE (U+00A0)
    nbsp = "\u00a0"
    result = list(sz.utf8_split_iter(f"a{nbsp}b"))
    assert len(result) == 2
    assert str(result[0]) == "a"
    assert str(result[1]) == "b"

    # Unicode whitespace: IDEOGRAPHIC SPACE (U+3000)
    ideo_space = "\u3000"
    result = list(sz.utf8_split_iter(f"日本{ideo_space}語"))
    assert len(result) == 2
    assert str(result[0]) == "日本"
    assert str(result[1]) == "語"

    # Unicode whitespace: EN SPACE (U+2002), EM SPACE (U+2003)
    en_space = "\u2002"
    em_space = "\u2003"
    result = list(sz.utf8_split_iter(f"a{en_space}b{em_space}c"))
    assert len(result) == 3
    assert str(result[0]) == "a"
    assert str(result[1]) == "b"
    assert str(result[2]) == "c"

    # Method form on Str
    result = list(sz.Str("hello world").utf8_split_iter())
    assert len(result) == 2
    assert str(result[0]) == "hello"
    assert str(result[1]) == "world"


@pytest.mark.parametrize(
    "text",
    ["hello world", "  spaced  ", "a\tb\nc", "no-spaces", "   ", ""],
)
def test_utf8_split_iter_matches_python(text):
    """Verify skip_empty=True matches Python's split() behavior."""
    py_result = text.split()
    sz_result = [str(s) for s in sz.utf8_split_iter(text, skip_empty=True)]
    assert sz_result == py_result


if __name__ == "__main__":
    sys.exit(pytest.main(["-x", "-s", __file__]))
